1. Outlier

set.seed(2664)

data = fread("data_weather.csv") %>%
  mutate(pick = ymd_hms(pick), drop = ymd_hms(drop), duration = as.numeric(drop - pick)) %>%
  as_tibble() %>% filter(duration < 800) %>% mutate(log_dura = log(.$duration))

origin_data = data

upper = data %>%
  filter(droplocation == "JFK") %>%
  select(duration) %>%
  boxplot() %>%
  .$stats %>% .[5,]

data = data %>%
  filter(duration > 15 & duration < upper) %>%
  mutate(log_dura = log(.$duration))

data = data %>%
  mutate(index = 1:nrow(data)) %>%
  group_by(DATE) %>%
  filter(index %in% sample(index,20))

2. Compare two arirports

summary(aov(log_dura ~ droplocation, data))
##                Df Sum Sq Mean Sq F value Pr(>F)    
## droplocation    1  409.4   409.4    5887 <2e-16 ***
## Residuals    6858  477.0     0.1                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3. split data

JFK = data %>% filter(droplocation == "JFK")
LAG = data %>% filter(droplocation == "LAG")

4. split time window

model_wh_JFK = JFK %>%
  mutate(time = ifelse(hour(pick) %in% c(0:6, 23), 1, ifelse(hour(pick) %in% c(7:10, 16:19), 2, 3)),
         weekday = ifelse(weekdays(pick) %in% c("Saturday", "Sunday"), 1,
                          ifelse(weekdays(pick) %in% c("Monday","Tuesday","Wednesday"), 2,3))) 
model_wh_JFK$time = factor(model_wh_JFK$time)
model_wh_JFK$weekday = factor(model_wh_JFK$weekday)

model_wh_LAG = LAG %>%
  mutate(time = ifelse(hour(pick) %in% c(0:6, 23), 1, ifelse(hour(pick) %in% c(7:10, 16:19), 2, 3)),
         weekday = ifelse(weekdays(pick) %in% c("Saturday", "Sunday"), 1,
                          ifelse(weekdays(pick) %in% c("Monday","Tuesday","Wednesday"), 2, 3)))
model_wh_LAG$time = factor(model_wh_LAG$time)
model_wh_LAG$weekday = factor(model_wh_LAG$weekday)

5. interaction plot

interaction.plot(model_wh_JFK$weekday,model_wh_JFK$time,model_wh_JFK$log_dura,
                 main = "Interaction Plot for JFK",
                 xlab = "Weekday", ylab = "log of duration", trace.label = "Time Zone")

interaction.plot(model_wh_LAG$weekday,model_wh_LAG$time,model_wh_LAG$log_dura,
                 main = "Interaction Plot for LAG",
                 xlab = "Weekday", ylab = "log of duration", trace.label = "Time Zone")

6. type III two way ANOVA

library(car)
lm_wh_JFK <- lm(log_dura ~ time * weekday, contrasts = list(time=contr.sum, weekday=contr.sum), data = model_wh_JFK)
Anova(lm_wh_JFK, type=3)
## Anova Table (Type III tests)
## 
## Response: log_dura
##              Sum Sq   Df    F value    Pr(>F)    
## (Intercept)   43280    1 917596.233 < 2.2e-16 ***
## time             52    2    549.239 < 2.2e-16 ***
## weekday           9    2     98.712 < 2.2e-16 ***
## time:weekday      4    4     20.781 < 2.2e-16 ***
## Residuals       148 3134                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm_wh_LAG <- lm(log_dura ~ time * weekday, contrasts = list(time=contr.sum, weekday=contr.sum), data = model_wh_LAG)
Anova(lm_wh_LAG, type=3)
## Anova Table (Type III tests)
## 
## Response: log_dura
##              Sum Sq   Df   F value    Pr(>F)    
## (Intercept)   38056    1 795529.73 < 2.2e-16 ***
## time             57    2    593.92 < 2.2e-16 ***
## weekday          14    2    144.82 < 2.2e-16 ***
## time:weekday      3    4     18.16  8.77e-15 ***
## Residuals       177 3708                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

7. multiple comparison for interaction

model_wh_JFK$comb = paste(as.character(model_wh_JFK$time),as.character(model_wh_JFK$weekday)) %>% gsub(" ", "", .) %>% factor()
anova_JFK <- aov(log_dura ~ time + weekday + comb, model_wh_JFK)
agricolae::HSD.test(anova_JFK, "comb", alpha = 0.05)$group
##    log_dura groups
## 23 4.032461      a
## 33 3.995427     ab
## 22 3.969308      b
## 32 3.963610      b
## 31 3.904146      c
## 21 3.780664      d
## 13 3.692944      e
## 12 3.632236      f
## 11 3.607276      f
model_wh_LAG$comb = paste(as.character(model_wh_LAG$time),as.character(model_wh_LAG$weekday)) %>% gsub(" ", "", .) %>% factor()
anova_LAG <- aov(log_dura ~ time + weekday + comb, model_wh_LAG)
agricolae::HSD.test(anova_LAG, "comb", alpha = 0.05)$group
##    log_dura groups
## 33 3.561377      a
## 23 3.544810      a
## 32 3.471657      b
## 22 3.463655      b
## 31 3.393732      c
## 21 3.283494      d
## 13 3.206756      e
## 12 3.176610     ef
## 11 3.132125      f

8. linear regression

comb_data <- rbind(model_wh_JFK,model_wh_LAG)
comb_data$drop_num <- factor(ifelse(comb_data$droplocation == "JFK", "1", "0"))
fit <- lm(log_dura ~ drop_num + time * weekday, comb_data)
summary(fit)
## 
## Call:
## lm(formula = log_dura ~ drop_num + time * weekday, data = comb_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63695 -0.15438 -0.02263  0.13393  1.09774 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.127933   0.010647 293.775  < 2e-16 ***
## drop_num1      0.484142   0.005322  90.968  < 2e-16 ***
## time2          0.162373   0.012664  12.822  < 2e-16 ***
## time3          0.280144   0.013575  20.636  < 2e-16 ***
## weekday2       0.037893   0.012472   3.038  0.00239 ** 
## weekday3       0.079711   0.013598   5.862 4.78e-09 ***
## time2:weekday2 0.144597   0.015857   9.119  < 2e-16 ***
## time3:weekday2 0.029698   0.016990   1.748  0.08051 .  
## time2:weekday3 0.176176   0.017416  10.116  < 2e-16 ***
## time3:weekday3 0.049437   0.018414   2.685  0.00728 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2182 on 6850 degrees of freedom
## Multiple R-squared:  0.6321, Adjusted R-squared:  0.6317 
## F-statistic:  1308 on 9 and 6850 DF,  p-value: < 2.2e-16

9. plot

# histogram
p1 = origin_data %>% mutate(log.dur = log(duration)) %>%
  ggplot(.,aes(x = log.dur))+
  geom_histogram(color="black", fill="steelblue",binwidth = 0.1)+
  scale_x_continuous(name="Log of duration")+
  theme_bw(14)

# histogram
p2 = origin_data %>%
ggplot(.,aes(x = duration))+
  geom_histogram(color="black", fill="steelblue",binwidth = 5)+
  scale_x_continuous(name="Duration")+
  theme_bw(14)

ggarrange(p2,p1,nrow = 2, ncol = 1)

# boxplot
origin_data %>%
  plot_ly(., x = ~duration, type = "box") %>%
  layout(title = "Boxplot of Durations", xaxis = list(title = "Durations"))
# smooth line
p3 = data %>% group_by(Rain) %>% summarise(duration.avg = mean(log_dura)) %>%
ggplot()+
  geom_smooth(aes(Rain, duration.avg), method = "loess")+
  xlab("Rain Levels")+
  ylab("Log of Duration (Min)")+
  ylim(3,4)+
  ggtitle("Average Duration VS Rain Effect")+
  theme_bw(14)

# smooth line
p4 = data %>%
  ggplot()+
  geom_smooth(aes(TMAX, log_dura), method = "loess", col = "red", se = F)+
  xlab("Temperature")+
  ylab("Log of Duration (Min)")+
  ylim(3.25,3.75)+
  ggtitle("Average Duration VS Temperature Effect")+
  theme_bw(14)

grid.arrange(p3,p4)

10. Prediction based on our model

new = data.frame(drop_num = "1", time = "1", weekday = "1")
y.pred = exp(predict(fit, newdata = new, type  = "response", interval = "predict"))
y.pred
##        fit      lwr      upr
## 1 37.04285 24.13981 56.84273